/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 * Test evaluation for caching allocator of device memory
 ******************************************************************************/

// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR

#include <cub/util_allocator.cuh>
#include <cub/util_device.cuh>

#include <stdio.h>

#include "test_util.h"

using namespace cub;

//---------------------------------------------------------------------
// Main
//---------------------------------------------------------------------

/**
 * Main
 */
/**
 * Main: test driver for cub::CachingDeviceAllocator.
 *
 * Exercises the allocator's stream-aware block reuse (Test0), bin rounding and
 * live/cached block accounting (Test1-Test8), optional multi-GPU allocation
 * (Test9, only when more than one device is present), and finally compares
 * CPU- and GPU-side allocation throughput of DeviceAllocate/DeviceFree against
 * raw cudaMalloc/cudaFree.
 *
 * Command line:
 *   [--device=<device-id>]     device to run on (handled by DeviceInit)
 *   [--bytes=<timing bytes>]   allocation size for the timing loops
 *   [--i=<timing iterations>]  iteration count for the timing loops
 *
 * Returns 0 on success; exits nonzero via CubDebugExit/AssertEquals on any
 * CUDA error or failed expectation.
 */
int main(int argc, char** argv)
{
  // Initialize command line
  CommandLineArgs args(argc, argv);

  // Print usage
  if (args.CheckCmdLineFlag("help"))
  {
    printf("%s "
           "[--device=<device-id>]"
           "[--bytes=<timing bytes>]"
           "[--i=<timing iterations>]"
           "\n",
           argv[0]);
    exit(0);
  }

  // Initialize device
  CubDebugExit(args.DeviceInit());

  // Get number of GPUs and current GPU
  int num_gpus;
  int initial_gpu;
  int timing_iterations = 10000;
  int timing_bytes      = 1024 * 1024;

  if (CubDebug(cudaGetDeviceCount(&num_gpus)))
  {
    exit(1);
  }
  if (CubDebug(cudaGetDevice(&initial_gpu)))
  {
    exit(1);
  }
  args.GetCmdLineArgument("i", timing_iterations);
  args.GetCmdLineArgument("bytes", timing_bytes);

  // Create default allocator (caches up to 6MB in device allocations per GPU)
  CachingDeviceAllocator allocator;
  allocator.debug = true;

  printf("Running single-gpu tests...\n");
  fflush(stdout);

  //
  // Test0: stream-aware reuse of cached blocks
  //

  // Create a new stream
  cudaStream_t other_stream;
  CubDebugExit(cudaStreamCreate(&other_stream));

  // Allocate 999 bytes on the current gpu in stream0
  char* d_999B_stream0_a;
  char* d_999B_stream0_b;
  CubDebugExit(allocator.DeviceAllocate((void**) &d_999B_stream0_a, 999, 0));

  // Run some big kernel in stream 0 so the block is "busy" when freed
  cub::detail::EmptyKernel<void><<<32000, 512, 1024 * 8, 0>>>();

  // Free d_999B_stream0_a
  CubDebugExit(allocator.DeviceFree(d_999B_stream0_a));

  // Allocate another 999 bytes in stream 0 (should reuse the cached block,
  // since reuse within the same stream is always safe)
  CubDebugExit(allocator.DeviceAllocate((void**) &d_999B_stream0_b, 999, 0));

  // Check that we have 1 live block on the initial GPU
  AssertEquals(allocator.live_blocks.size(), 1);

  // Check that we have no cached block on the initial GPU
  AssertEquals(allocator.cached_blocks.size(), 0);

  // Run some big kernel in stream 0
  cub::detail::EmptyKernel<void><<<32000, 512, 1024 * 8, 0>>>();

  // Free d_999B_stream0_b
  CubDebugExit(allocator.DeviceFree(d_999B_stream0_b));

  // Allocate 999 bytes on the current gpu in other_stream.
  // NOTE: previously this call's return code was ignored; wrap it in
  // CubDebugExit like every other allocator call so errors are not silent.
  char* d_999B_stream_other_a;
  char* d_999B_stream_other_b;
  CubDebugExit(allocator.DeviceAllocate((void**) &d_999B_stream_other_a, 999, other_stream));

  // Check that we have 1 live block on the initial GPU (a new one was
  // allocated because d_999B_stream0_b is only available for stream 0 until
  // stream 0 becomes idle)
  AssertEquals(allocator.live_blocks.size(), 1);

  // Check that we have one cached block on the initial GPU
  AssertEquals(allocator.cached_blocks.size(), 1);

  // Run some big kernel in other_stream
  cub::detail::EmptyKernel<void><<<32000, 512, 1024 * 8, other_stream>>>();

  // Free d_999B_stream_other_a
  CubDebugExit(allocator.DeviceFree(d_999B_stream_other_a));

  // Check that we can now use both allocations in stream 0 after synchronizing the device
  CubDebugExit(cudaDeviceSynchronize());
  CubDebugExit(allocator.DeviceAllocate((void**) &d_999B_stream0_a, 999, 0));
  CubDebugExit(allocator.DeviceAllocate((void**) &d_999B_stream0_b, 999, 0));

  // Check that we have 2 live blocks on the initial GPU
  AssertEquals(allocator.live_blocks.size(), 2);

  // Check that we have no cached block on the initial GPU
  AssertEquals(allocator.cached_blocks.size(), 0);

  // Free d_999B_stream0_a and d_999B_stream0_b
  CubDebugExit(allocator.DeviceFree(d_999B_stream0_a));
  CubDebugExit(allocator.DeviceFree(d_999B_stream0_b));

  // Check that we can now use both allocations in other_stream
  CubDebugExit(cudaDeviceSynchronize());
  CubDebugExit(allocator.DeviceAllocate((void**) &d_999B_stream_other_a, 999, other_stream));
  CubDebugExit(allocator.DeviceAllocate((void**) &d_999B_stream_other_b, 999, other_stream));

  // Check that we have 2 live blocks on the initial GPU
  AssertEquals(allocator.live_blocks.size(), 2);

  // Check that we have no cached block on the initial GPU
  AssertEquals(allocator.cached_blocks.size(), 0);

  // Run some big kernel in other_stream
  cub::detail::EmptyKernel<void><<<32000, 512, 1024 * 8, other_stream>>>();

  // Free d_999B_stream_other_a and d_999B_stream_other_b
  CubDebugExit(allocator.DeviceFree(d_999B_stream_other_a));
  CubDebugExit(allocator.DeviceFree(d_999B_stream_other_b));

  // Check that we can now use both allocations in stream 0 after synchronizing the device and destroying the other
  // stream
  CubDebugExit(cudaDeviceSynchronize());
  CubDebugExit(cudaStreamDestroy(other_stream));
  CubDebugExit(allocator.DeviceAllocate((void**) &d_999B_stream0_a, 999, 0));
  CubDebugExit(allocator.DeviceAllocate((void**) &d_999B_stream0_b, 999, 0));

  // Check that we have 2 live blocks on the initial GPU
  AssertEquals(allocator.live_blocks.size(), 2);

  // Check that we have no cached block on the initial GPU
  AssertEquals(allocator.cached_blocks.size(), 0);

  // Free d_999B_stream0_a and d_999B_stream0_b
  CubDebugExit(allocator.DeviceFree(d_999B_stream0_a));
  CubDebugExit(allocator.DeviceFree(d_999B_stream0_b));

  // Free all cached
  CubDebugExit(allocator.FreeAllCached());

  //
  // Test1: a fresh allocation is live, nothing cached yet
  //

  // Allocate 5 bytes on the current gpu
  char* d_5B;
  CubDebugExit(allocator.DeviceAllocate((void**) &d_5B, 5));

  // Check that we have zero free bytes cached on the initial GPU
  AssertEquals(allocator.cached_bytes[initial_gpu].free, 0);

  // Check that we have 1 live block on the initial GPU
  AssertEquals(allocator.live_blocks.size(), 1);

  //
  // Test2: a second allocation adds a second live block
  //

  // Allocate 4096 bytes on the current gpu
  char* d_4096B;
  CubDebugExit(allocator.DeviceAllocate((void**) &d_4096B, 4096));

  // Check that we have 2 live blocks on the initial GPU
  AssertEquals(allocator.live_blocks.size(), 2);

  //
  // Test3: freeing caches the block, rounded up to min_bin_bytes
  //

  // DeviceFree d_5B
  CubDebugExit(allocator.DeviceFree(d_5B));

  // Check that we have min_bin_bytes free bytes cached on the initial gpu
  // (the 5-byte request was rounded up to the smallest bin)
  AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes);

  // Check that we have 1 live block on the initial GPU
  AssertEquals(allocator.live_blocks.size(), 1);

  // Check that we have 1 cached block on the initial GPU
  AssertEquals(allocator.cached_blocks.size(), 1);

  //
  // Test4: freeing the second block accumulates cached bytes
  //

  // DeviceFree d_4096B
  CubDebugExit(allocator.DeviceFree(d_4096B));

  // Check that we have 4096 + min_bin_bytes free bytes cached on the initial gpu
  AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes + 4096);

  // Check that we have 0 live blocks on the initial GPU
  AssertEquals(allocator.live_blocks.size(), 0);

  // Check that we have 2 cached blocks on the initial GPU
  AssertEquals(allocator.cached_blocks.size(), 2);

  //
  // Test5: a request that fits an existing bin reuses the cached block
  //

  // Allocate 768 bytes on the current gpu
  char* d_768B;
  CubDebugExit(allocator.DeviceAllocate((void**) &d_768B, 768));

  // Check that we have min_bin_bytes free bytes cached on the initial gpu (the 4096 block was reused)
  AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes);

  // Check that we have 1 live block on the initial GPU
  AssertEquals(allocator.live_blocks.size(), 1);

  // Check that we have 1 cached block on the initial GPU
  AssertEquals(allocator.cached_blocks.size(), 1);

  //
  // Test6: freeing a block that would exceed max_cached_bytes returns it to the OS
  //

  // Allocate max_cached_bytes on the current gpu
  char* d_max_cached;
  CubDebugExit(allocator.DeviceAllocate((void**) &d_max_cached, allocator.max_cached_bytes));

  // DeviceFree d_max_cached
  CubDebugExit(allocator.DeviceFree(d_max_cached));

  // Check that we still have only min_bin_bytes cached on the initial gpu (the max-cached-sized
  // block was not retained because caching it would exceed the cap)
  AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes);

  // Check that we have 1 live block on the initial GPU
  AssertEquals(allocator.live_blocks.size(), 1);

  // Check that we still have 1 cached block on the initial GPU
  AssertEquals(allocator.cached_blocks.size(), 1);

  //
  // Test7: FreeAllCached drops cached blocks but not live ones
  //

  // Free all cached blocks on all GPUs
  CubDebugExit(allocator.FreeAllCached());

  // Check that we have 0 bytes cached on the initial GPU
  AssertEquals(allocator.cached_bytes[initial_gpu].free, 0);

  // Check that we have 0 cached blocks across all GPUs
  AssertEquals(allocator.cached_blocks.size(), 0);

  // Check that we still have 1 live block across all GPUs
  AssertEquals(allocator.live_blocks.size(), 1);

  //
  // Test8: oversized blocks are never cached; normal frees round up by bin_growth
  //

  // Allocate max cached bytes + 1 on the current gpu
  char* d_max_cached_plus;
  CubDebugExit(allocator.DeviceAllocate((void**) &d_max_cached_plus, allocator.max_cached_bytes + 1));

  // DeviceFree max cached bytes
  CubDebugExit(allocator.DeviceFree(d_max_cached_plus));

  // DeviceFree d_768B
  CubDebugExit(allocator.DeviceFree(d_768B));

  // Compute the bin size that a 768-byte request rounds up to
  unsigned int power;
  size_t rounded_bytes;
  allocator.NearestPowerOf(power, rounded_bytes, allocator.bin_growth, 768);

  // Check that we have rounded_bytes free bytes cached on the initial gpu
  // (only the 768-byte block was cached; the oversized one was returned to the OS)
  AssertEquals(allocator.cached_bytes[initial_gpu].free, rounded_bytes);

  // Check that we have 1 cached block across all GPUs
  AssertEquals(allocator.cached_blocks.size(), 1);

  // Check that we still have 0 live blocks across all GPUs
  AssertEquals(allocator.live_blocks.size(), 0);

  // BUG: find out why these tests fail when one GPU is CDP compliant and the other is not

  if (num_gpus > 1)
  {
    printf("\nRunning multi-gpu tests...\n");
    fflush(stdout);

    //
    // Test9: per-device caching is independent
    //

    // Allocate 768 bytes on the next gpu
    int next_gpu = (initial_gpu + 1) % num_gpus;
    char* d_768B_2;
    CubDebugExit(allocator.DeviceAllocate(next_gpu, (void**) &d_768B_2, 768));

    // DeviceFree d_768B on the next gpu
    CubDebugExit(allocator.DeviceFree(next_gpu, d_768B_2));

    // Re-allocate 768 bytes on the next gpu
    CubDebugExit(allocator.DeviceAllocate(next_gpu, (void**) &d_768B_2, 768));

    // Re-free d_768B on the next gpu
    CubDebugExit(allocator.DeviceFree(next_gpu, d_768B_2));

    // Check that we have rounded_bytes free bytes cached on the initial gpu
    AssertEquals(allocator.cached_bytes[initial_gpu].free, rounded_bytes);

    // Check that we have rounded_bytes free bytes cached on the second gpu
    AssertEquals(allocator.cached_bytes[next_gpu].free, rounded_bytes);

    // Check that we have 2 cached blocks across all GPUs
    AssertEquals(allocator.cached_blocks.size(), 2);

    // Check that we still have 0 live blocks across all GPUs
    AssertEquals(allocator.live_blocks.size(), 0);
  }

  //
  // Performance
  //

  printf("\nCPU Performance (%d timing iterations, %d bytes):\n", timing_iterations, timing_bytes);
  fflush(stdout);
  fflush(stderr);

  // CPU performance comparison vs cached: allocate and free a timing_bytes-sized
  // block timing_iterations times
  CpuTimer cpu_timer;
  char* d_1024MB  = nullptr;
  allocator.debug = false;

  // Prime the caching allocator and the kernel
  CubDebugExit(allocator.DeviceAllocate((void**) &d_1024MB, timing_bytes));
  CubDebugExit(allocator.DeviceFree(d_1024MB));
  cub::detail::EmptyKernel<void><<<1, 32>>>();

  // CUDA
  cpu_timer.Start();
  for (int i = 0; i < timing_iterations; ++i)
  {
    CubDebugExit(cudaMalloc((void**) &d_1024MB, timing_bytes));
    CubDebugExit(cudaFree(d_1024MB));
  }
  cpu_timer.Stop();
  float cuda_malloc_elapsed_millis = cpu_timer.ElapsedMillis();

  // CUB
  cpu_timer.Start();
  for (int i = 0; i < timing_iterations; ++i)
  {
    CubDebugExit(allocator.DeviceAllocate((void**) &d_1024MB, timing_bytes));
    CubDebugExit(allocator.DeviceFree(d_1024MB));
  }
  cpu_timer.Stop();
  float cub_calloc_elapsed_millis = cpu_timer.ElapsedMillis();

  printf("\t CUB CachingDeviceAllocator allocation CPU speedup: %.2f (avg cudaMalloc %.4f ms vs. avg DeviceAllocate "
         "%.4f ms)\n",
         cuda_malloc_elapsed_millis / cub_calloc_elapsed_millis,
         cuda_malloc_elapsed_millis / timing_iterations,
         cub_calloc_elapsed_millis / timing_iterations);

  // GPU performance comparison: same loops, but with a kernel launch between
  // allocate and free, timed on the device; kernel-only time is subtracted out
  GpuTimer gpu_timer;

  printf("\nGPU Performance (%d timing iterations, %d bytes):\n", timing_iterations, timing_bytes);
  fflush(stdout);
  fflush(stderr);

  // Kernel-only baseline
  gpu_timer.Start();
  for (int i = 0; i < timing_iterations; ++i)
  {
    cub::detail::EmptyKernel<void><<<1, 32>>>();
  }
  gpu_timer.Stop();
  float cuda_empty_elapsed_millis = gpu_timer.ElapsedMillis();

  // CUDA
  gpu_timer.Start();
  for (int i = 0; i < timing_iterations; ++i)
  {
    CubDebugExit(cudaMalloc((void**) &d_1024MB, timing_bytes));
    cub::detail::EmptyKernel<void><<<1, 32>>>();
    CubDebugExit(cudaFree(d_1024MB));
  }
  gpu_timer.Stop();
  cuda_malloc_elapsed_millis = gpu_timer.ElapsedMillis() - cuda_empty_elapsed_millis;

  // CUB
  gpu_timer.Start();
  for (int i = 0; i < timing_iterations; ++i)
  {
    CubDebugExit(allocator.DeviceAllocate((void**) &d_1024MB, timing_bytes));
    cub::detail::EmptyKernel<void><<<1, 32>>>();
    CubDebugExit(allocator.DeviceFree(d_1024MB));
  }
  gpu_timer.Stop();
  cub_calloc_elapsed_millis = gpu_timer.ElapsedMillis() - cuda_empty_elapsed_millis;

  printf("\t CUB CachingDeviceAllocator allocation GPU speedup: %.2f (avg cudaMalloc %.4f ms vs. avg DeviceAllocate "
         "%.4f ms)\n",
         cuda_malloc_elapsed_millis / cub_calloc_elapsed_millis,
         cuda_malloc_elapsed_millis / timing_iterations,
         cub_calloc_elapsed_millis / timing_iterations);

  printf("Success\n");

  return 0;
}
